import pandas as pd
import openpyxl
import re
xlsx = '新能源行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
link_1=links[1:-1]
link_2=''.join(link_1)
p= re.compile('"(.*?),"(.*?)"')
list_of_tuple=p.findall(link_2)
df2=pd.DataFrame({'link':[t[0]for t in list_of_tuple],
'f_name':[t[1]for t in list_of_tuple]})
df2.to_csv('新能源行业.csv')
import re
import pandas as pd
import os
df=pd.read_csv('新能源行业.csv')
p=re.compile('(?<=\d{4})(年报)|(年年报)')
f_names=[p.sub(('年年度报'), f)for f in df.f_name]
df['f_name']=f_names;del p,f_names
def filter_links(words,df,include=True):
Is=[]
for word in words:
if include:
Is.append([word in f for f in df.f_name])
else:
Is.append([word not in f for f in df.f_name])
index=[]
for r in range(len(df)):
flag=not include
for c in range(len(words)):
if include:
flag = flag or Is[c][r]
else:
flag = flag and Is[c][r]
index.append(flag)
df2 = df[index]
return(df2)
df_all = filter_links(['摘要','询问函','社会责任','审计','财务','风险','债券'],df,include=False)
df_orig = filter_links(['(','('], df_all,include=False)
df_updt = filter_links(['(','('], df_all,include=True)
df_updt = filter_links(['取消'],df_updt,include=False)
def sub_with_update(df_updt,df_orig):
df_newest = df_orig.copy()
index_orig = []
index_updt = []
for i,f in enumerate(df_orig.f_name):
for j,fn in enumerate(df_updt.f_name):
if f in fn:
index_orig.append(i)
index_updt.append(j)
#return((index_orig,index_updt))
for n in range(len(index_orig)):
i = index_orig[n]
j = index_updt[n]
df_newest.iloc[i,-2] = df_updt.iloc[j,-2]
#df_newest.iloc[i,-1] = df_updt.iloc[j,-1]
return(df_newest)
df_newest = sub_with_update(df_updt,df_orig)
#index_orig,index_updt = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
ten_company = []
for cn in counts.index[:10]:
ten_company.append(filter_links([cn], df_newest))
if not os.path.exists('10companies'):
os.makedirs('10companies')
for df_com in ten_company:
cn = df_com['公司简称'].iloc[0]
df_com.to_csv('10companies/%s.csv' % cn)
ten_csv = os.listdir('10companies')
import fitz
import os
os.chdir(r"C:\Users\dell\Desktop\limengfei\limengfei\final-project\10companies")
filenames = os.listdir() #把代码所在文件夹的所有文件和文件名查找出来
finalfix = '年年度报告'
pdf_list = [f for f in filenames if f.endswith('年年度报告.pdf')]
years = [f[-13:-9] for f in pdf_list]
pdf_list
import fitz
import os
import re
import pandas as pd
os.chdir(r"C:\Users\dell\Desktop\limengfei\limengfei\final-project\10companies")
filenames = os.listdir() #把代码所在文件夹的所有文件和文件名查找出来
finalfix = '年年度报告'
pdf_list = [f for f in filenames if f.endswith('年年度报告.pdf')]
years = [f[-13:-9] for f in pdf_list]
pdf_list
def extract_data(pdf):
idx = pdf.find(':')
company_name = pdf[0:idx]
year = pdf[idx+1:idx+5]
#
doc = fitz.open(pdf)
text = [page.get_text() for page in doc]
text = ''.join(text)
#
p_s = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')
section_match = p_s.search(text)
s_idx = section_match.start()
#
p = re.compile('营业收入(.*?)归属于',re.DOTALL)
data_line = p.search(text[s_idx:]).group()
data_line = data_line.replace('\n', '')
p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')
revenue = p_digit.search(data_line).group()
revenue = revenue.replace(',','')
return((company_name,year,revenue))
companies, years, revenues = [],[],[]
for pdf in pdf_list:
company, year, revenue = extract_data(pdf)
companies.append(company)
years.append(year)
revenues.append(revenue)
df = pd.DataFrame({'company': companies,
'year': years,
'revenue': revenues})
df.sort_values('company',ignore_index=True, inplace=True)
df
x = [2019,2020]
y = [1513568678.62,1458378294.45]
plt.title('revenue changing during 2019-2020')
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()
x = [2019,2020]
y = [4044447405.35,4455989703.12]
plt.title('revenue changing during 2019-2020')
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()
x = [2019,2020]
y = [753352460.75,729359322.62]
plt.title('revenue changing during 2019-2020')
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()
x = [2019,2020]
y = [8679910968.83,8215896691.46]
plt.title('revenue changing during 2019-2020')
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()
x = [2019,2020]
y = [1806971546.97,2272208133.84]
plt.title('revenue changing during 2019-2020')
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()
x = [2019,2020]
y = [435349250.51,518723551.37]
plt.title('revenue changing during 2019-2020')
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()
x = [2019,2020]
y = [8145952951.58,12466276299.40]
plt.title('revenue changing during 2019-2020')
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()